This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

# library for LMM 

library(lme4)
library(lmerTest)
library(car)
Loading required package: carData
df<-read.csv("input/scores_commits.csv", header =TRUE, sep=",")
df <- df[complete.cases(df), ]                  # Apply complete.cases function
df
# Convert Group and phase to nominal factors so that model formulas treat
# them as categorical predictors.
df$Group <- factor(df$Group)
df$phase <- factor(df$phase)
df$log_novelty <- log(df$novelty+1) 
df$log_user_requirement <- log(df$user.requirement+1)
df$log_infovis <- log(df$infovis+1)
df$log_total <- log(df$total+1)
df$log_count <- log(df$count+1)
df$Q7_Q7_1 <- log(df$Q7_Q7_1+1)
df$Q7_Q7_2 <- log(df$Q7_Q7_2+1)
df$Q8_Q8_1 <- log(df$Q8_Q8_1+1)
df$Q10 <- log(df$Q10+1)
# standardizing variables for skills and aspirations. 
cols <- c("Q7_Q7_1", "Q7_Q7_2", "Q8_Q8_1", "Q10", "log_novelty", "log_user_requirement", "log_infovis", "log_total", "log_count")
df[cols] <- scale(df[cols])
df
mod.reduce.novelty <- lm( log_novelty ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)
summary(mod.reduce.novelty)

Call:
lm(formula = log_novelty ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8291 -0.8409  0.1955  0.8140  1.7601 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.457e-15  3.280e-02   0.000  1.00000    
log_count    2.970e-01  3.313e-02   8.964  < 2e-16 ***
Q7_Q7_1     -1.967e-01  4.120e-02  -4.775 2.14e-06 ***
Q7_Q7_2      1.805e-01  4.223e-02   4.275 2.14e-05 ***
Q8_Q8_1      3.738e-03  3.627e-02   0.103  0.91795    
Q10          1.034e-01  3.539e-02   2.921  0.00359 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9276 on 794 degrees of freedom
Multiple R-squared:  0.1449,    Adjusted R-squared:  0.1395 
F-statistic: 26.91 on 5 and 794 DF,  p-value: < 2.2e-16
AIC(mod.reduce.novelty)
[1] 2158.067
BIC(mod.reduce.novelty)
[1] 2190.859
mod.full.novelty <- lm( log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)
summary(mod.full.novelty)

Call:
lm(formula = log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8606 -0.8544  0.1794  0.8162  1.8526 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -0.129451   0.068285  -1.896  0.05836 .  
factor(Group)1  0.237230   0.094751   2.504  0.01249 *  
factor(Group)2  0.168224   0.094639   1.778  0.07586 .  
factor(Group)3  0.099722   0.094145   1.059  0.28981    
log_count       0.292623   0.033134   8.832  < 2e-16 ***
Q7_Q7_1        -0.199272   0.041287  -4.826 1.67e-06 ***
Q7_Q7_2         0.178203   0.042382   4.205 2.91e-05 ***
Q8_Q8_1         0.001541   0.036257   0.043  0.96611    
Q10             0.099003   0.035891   2.758  0.00594 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9254 on 791 degrees of freedom
Multiple R-squared:  0.1523,    Adjusted R-squared:  0.1437 
F-statistic: 17.76 on 8 and 791 DF,  p-value: < 2.2e-16
AIC(mod.full.novelty)
[1] 2157.128
BIC(mod.full.novelty)
[1] 2203.974
anova(mod.reduce.novelty, mod.full.novelty)
Analysis of Variance Table

Model 1: log_novelty ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10
Model 2: log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10
  Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
1    794 683.22                              
2    791 677.32  3    5.9001 2.2968 0.07633 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
library(ALSM)
Loading required package: leaps
Loading required package: SuppDists
# Stepwise AIC selection starting from the full novelty model.
# step() has no `method` argument; the search mode is chosen with
# `direction`, so it is spelled out here instead of relying on the default.
step(
  lm(log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
     data = df),
  direction = "both", trace = 1
)
Start:  AIC=-115.17
log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10

                Df Sum of Sq    RSS      AIC
- Q8_Q8_1        1     0.002 677.32 -117.172
<none>                       677.32 -115.173
- factor(Group)  3     5.900 683.22 -114.235
- Q10            1     6.515 683.84 -109.515
- Q7_Q7_2        1    15.138 692.46  -99.490
- Q7_Q7_1        1    19.947 697.27  -93.954
- log_count      1    66.787 744.11  -41.940

Step:  AIC=-117.17
log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q10

                Df Sum of Sq    RSS      AIC
<none>                       677.32 -117.172
- factor(Group)  3     5.908 683.23 -116.224
- Q10            1     7.106 684.43 -110.823
- Q7_Q7_2        1    15.596 692.92 -100.960
- Q7_Q7_1        1    20.066 697.39  -95.815
- log_count      1    67.067 744.39  -43.638

Call:
lm(formula = log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q10, data = df)

Coefficients:
   (Intercept)  factor(Group)1  factor(Group)2  factor(Group)3       log_count         Q7_Q7_1         Q7_Q7_2  
      -0.12958         0.23732         0.16843         0.09995         0.29271        -0.19912         0.17849  
           Q10  
       0.09942  
mod.reduce.ur <- lm(log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.reduce.ur)

Call:
lm(formula = log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.9234 -0.9854  0.3754  0.7441  1.6632 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.509e-16  3.348e-02   0.000 1.000000    
log_count    2.476e-01  3.383e-02   7.319 6.11e-13 ***
Q7_Q7_1     -1.897e-01  4.207e-02  -4.509 7.51e-06 ***
Q7_Q7_2      1.157e-01  4.311e-02   2.685 0.007412 ** 
Q8_Q8_1     -4.640e-03  3.703e-02  -0.125 0.900327    
Q10          1.194e-01  3.613e-02   3.305 0.000993 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9471 on 794 degrees of freedom
Multiple R-squared:  0.1086,    Adjusted R-squared:  0.103 
F-statistic: 19.36 on 5 and 794 DF,  p-value: < 2.2e-16
AIC(mod.reduce.ur)
[1] 2191.288
BIC(mod.reduce.ur)
[1] 2224.081
mod.full.ur <- lm(log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.full.ur)

Call:
lm(formula = log_user_requirement ~ factor(Group) + log_count + 
    Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.0497 -0.9697  0.3109  0.7168  1.7587 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -0.20771    0.06930  -2.997 0.002809 ** 
factor(Group)1  0.30843    0.09616   3.207 0.001393 ** 
factor(Group)2  0.15234    0.09604   1.586 0.113097    
factor(Group)3  0.34981    0.09554   3.661 0.000268 ***
log_count       0.23922    0.03363   7.114 2.53e-12 ***
Q7_Q7_1        -0.19761    0.04190  -4.716 2.84e-06 ***
Q7_Q7_2         0.11979    0.04301   2.785 0.005482 ** 
Q8_Q8_1        -0.01073    0.03680  -0.292 0.770735    
Q10             0.11094    0.03642   3.046 0.002398 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9391 on 791 degrees of freedom
Multiple R-squared:  0.1269,    Adjusted R-squared:  0.1181 
F-statistic: 14.37 on 8 and 791 DF,  p-value: < 2.2e-16
AIC(mod.full.ur)
[1] 2180.715
BIC(mod.full.ur)
[1] 2227.561
anova(mod.reduce.ur, mod.full.ur)
Analysis of Variance Table

Model 1: log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10
Model 2: log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q8_Q8_1 + Q10
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1    794 712.19                                  
2    791 697.59  3    14.602 5.5192 0.0009401 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
library(ALSM)
# Stepwise AIC selection starting from the full user-requirement model.
# step() has no `method` argument; the search mode is chosen with
# `direction`, so it is spelled out here instead of relying on the default.
step(
  lm(log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
     data = df),
  direction = "both", trace = 1
)
Start:  AIC=-91.59
log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q8_Q8_1 + Q10

                Df Sum of Sq    RSS     AIC
- Q8_Q8_1        1     0.075 697.66 -93.500
<none>                       697.59 -91.586
- Q7_Q7_2        1     6.840 704.43 -85.780
- Q10            1     8.181 705.77 -84.258
- factor(Group)  3    14.602 712.19 -81.013
- Q7_Q7_1        1    19.617 717.20 -71.400
- log_count      1    44.633 742.22 -43.971

Step:  AIC=-93.5
log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + 
    Q7_Q7_2 + Q10

                Df Sum of Sq    RSS     AIC
<none>                       697.66 -93.500
- Q7_Q7_2        1     6.788 704.45 -87.755
- Q10            1     8.389 706.05 -85.938
- factor(Group)  3    14.541 712.20 -82.997
- Q7_Q7_1        1    19.975 717.64 -72.917
- log_count      1    44.575 742.24 -45.954

Call:
lm(formula = log_user_requirement ~ factor(Group) + log_count + 
    Q7_Q7_1 + Q7_Q7_2 + Q10, data = df)

Coefficients:
   (Intercept)  factor(Group)1  factor(Group)2  factor(Group)3       log_count         Q7_Q7_1         Q7_Q7_2  
       -0.2068          0.3078          0.1509          0.3482          0.2386         -0.1987          0.1178  
           Q10  
        0.1080  
mod.reduce.vis <- lmer( log_infovis ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | Student) + ( 1 | phase) , data = df, REML = FALSE)
summary(mod.reduce.vis)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_infovis ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +      (1 | Student) + (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  1686.8   1729.0   -834.4   1668.8      791 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.3459 -0.2711  0.0568  0.4528  3.7219 

Random effects:
 Groups   Name        Variance Std.Dev.
 Student  (Intercept) 0.56556  0.7520  
 phase    (Intercept) 0.04482  0.2117  
 Residual             0.28839  0.5370  
Number of obs: 800, groups:  Student, 159; phase, 5

Fixed effects:
              Estimate Std. Error         df t value Pr(>|t|)    
(Intercept)  -0.004387   0.113506   9.331211  -0.039 0.969986    
log_count     0.101909   0.026572 738.704420   3.835 0.000136 ***
Q7_Q7_1      -0.207401   0.078496 155.246118  -2.642 0.009080 ** 
Q7_Q7_2       0.220425   0.080348 153.994778   2.743 0.006803 ** 
Q8_Q8_1      -0.063161   0.068976 153.572132  -0.916 0.361259    
Q10           0.113008   0.067332 153.554567   1.678 0.095310 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
          (Intr) lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
log_count  0.002                              
Q7_Q7_1    0.005  0.043                       
Q7_Q7_2   -0.002 -0.023 -0.556                
Q8_Q8_1    0.002 -0.024 -0.075  -0.172        
Q10        0.002 -0.025 -0.009  -0.126  -0.275
AIC(mod.reduce.vis)
[1] 1686.82
BIC(mod.reduce.vis)
[1] 1728.981
mod.full.vis <- lmer( log_infovis ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | Student) + ( 1 | phase) , data = df, REML = FALSE)
summary(mod.full.vis)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_infovis ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 +      Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  1686.0   1742.3   -831.0   1662.0      788 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.3644 -0.2699  0.0488  0.4521  3.7079 

Random effects:
 Groups   Name        Variance Std.Dev.
 Student  (Intercept) 0.53930  0.7344  
 phase    (Intercept) 0.04475  0.2115  
 Residual             0.28841  0.5370  
Number of obs: 800, groups:  Student, 159; phase, 5

Fixed effects:
                Estimate Std. Error        df t value Pr(>|t|)    
(Intercept)     -0.28898    0.15869  32.79179  -1.821 0.077738 .  
factor(Group)1   0.39368    0.17736 153.34225   2.220 0.027905 *  
factor(Group)2   0.30674    0.17669 153.35371   1.736 0.084557 .  
factor(Group)3   0.41230    0.17566 153.43206   2.347 0.020195 *  
log_count        0.10225    0.02653 741.52333   3.854 0.000126 ***
Q7_Q7_1         -0.21031    0.07709 155.17239  -2.728 0.007106 ** 
Q7_Q7_2          0.21722    0.07909 153.92204   2.746 0.006744 ** 
Q8_Q8_1         -0.07213    0.06766 153.47966  -1.066 0.288008    
Q10              0.10932    0.06698 153.45530   1.632 0.104668    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) fc(G)1 fc(G)2 fc(G)3 lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
factr(Grp)1 -0.577                                                   
factr(Grp)2 -0.580  0.511                                            
factr(Grp)3 -0.583  0.522  0.525                                     
log_count    0.014 -0.023 -0.005 -0.018                              
Q7_Q7_1      0.001 -0.031  0.047 -0.009  0.045                       
Q7_Q7_2      0.026  0.001 -0.089 -0.010 -0.025 -0.559                
Q8_Q8_1      0.038 -0.021 -0.051 -0.059 -0.024 -0.077  -0.167        
Q10         -0.007 -0.081  0.095  0.015 -0.022  0.005  -0.140  -0.276
AIC(mod.full.vis)
[1] 1686.045
BIC(mod.full.vis)
[1] 1742.26
anova(mod.reduce.vis, mod.full.vis)
Data: df
Models:
mod.reduce.vis: log_infovis ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase)
mod.full.vis: log_infovis ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase)
               npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
mod.reduce.vis    9 1686.8 1729.0 -834.41   1668.8                       
mod.full.vis     12 1686.0 1742.3 -831.02   1662.0 6.7749  3    0.07943 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
library(ALSM)
# Stepwise AIC selection starting from the full infovis model (plain lm,
# without the random effects used in the lmer fits).
# step() has no `method` argument; the search mode is chosen with
# `direction`, so it is spelled out here instead of relying on the default.
step(
  lm(log_infovis ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
     data = df),
  direction = "both", trace = 1
)
Start:  AIC=-131.12
log_infovis ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10

            Df Sum of Sq    RSS      AIC
<none>                   663.96 -131.117
- Q8_Q8_1    1     4.428 668.38 -127.799
- Q10        1     6.615 670.57 -125.186
- Group      3    18.523 682.48 -115.104
- Q7_Q7_1    1    16.528 680.48 -113.446
- Q7_Q7_2    1    19.340 683.30 -110.147
- log_count  1    68.596 732.55  -54.463

Call:
lm(formula = log_infovis ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10, data = df)

Coefficients:
(Intercept)       Group1       Group2       Group3    log_count      Q7_Q7_1      Q7_Q7_2      Q8_Q8_1          Q10  
   -0.27221      0.37203      0.30113      0.38910      0.29656     -0.18139      0.20142     -0.08245      0.09975  
mod.reduce.total <- lm( log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(mod.reduce.total)

Call:
lm(formula = log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.1861 -0.1993  0.2443  0.5703  1.4738 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.422e-16  3.222e-02   0.000 1.000000    
log_count    3.419e-01  3.255e-02  10.503  < 2e-16 ***
Q7_Q7_1     -1.852e-01  4.048e-02  -4.576 5.51e-06 ***
Q7_Q7_2      1.875e-01  4.148e-02   4.520 7.12e-06 ***
Q8_Q8_1     -8.656e-02  3.563e-02  -2.429 0.015349 *  
Q10          1.241e-01  3.477e-02   3.570 0.000378 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9113 on 794 degrees of freedom
Multiple R-squared:  0.1748,    Adjusted R-squared:  0.1696 
F-statistic: 33.63 on 5 and 794 DF,  p-value: < 2.2e-16
AIC(mod.reduce.total)
[1] 2129.645
BIC(mod.reduce.total)
[1] 2162.437
# Full model for log_total: every covariate of mod.reduce.total plus Group.
# log_count has to be included so that mod.reduce.total is nested in this
# model; anova() only yields a valid F test for nested fits.
# NOTE: the console output captured after this line must be regenerated by
# re-running the notebook.
mod.full.total <- lm(log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)
summary(mod.full.total)

Call:
lm(formula = log_total ~ Group + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9973 -0.1279  0.2773  0.5482  1.4041 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.29834    0.07061  -4.225 2.66e-05 ***
Group1       0.43251    0.09786   4.420 1.13e-05 ***
Group2       0.28093    0.09793   2.869 0.004231 ** 
Group3       0.45080    0.09734   4.631 4.25e-06 ***
Q7_Q7_1     -0.23066    0.04253  -5.424 7.76e-08 ***
Q7_Q7_2      0.20946    0.04380   4.783 2.06e-06 ***
Q8_Q8_1     -0.07307    0.03745  -1.951 0.051416 .  
Q10          0.13970    0.03708   3.767 0.000177 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9576 on 792 degrees of freedom
Multiple R-squared:  0.09105,   Adjusted R-squared:  0.08302 
F-statistic: 11.33 on 7 and 792 DF,  p-value: 9.643e-14
AIC(mod.full.total)
[1] 2210.925
BIC(mod.full.total)
[1] 2253.087
anova(mod.reduce.total, mod.full.total)
Analysis of Variance Table

Model 1: log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10
Model 2: log_total ~ Group + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10
  Res.Df    RSS Df Sum of Sq F Pr(>F)
1    794 659.37                      
2    792 726.25  2   -66.874         
# Make sure Group and phase are nominal factors before summarising/plotting.
df$Group <- factor(df$Group)
df$phase <- factor(df$phase)
library(plyr)
# Per Group x phase cell: five-number summary, then mean and sd of log_novelty.
ddply(df, ~ Group * phase, function(data) summary(data$log_novelty) )
ddply(df, ~ Group * phase, summarise, log_novelty.mean=mean(log_novelty), log_novelty.sd = sd(log_novelty))

# The outcome columns were standardised earlier (scale()), so their cell
# means lie around 0. The interaction plots therefore use the default y
# range (range of the cell means) instead of forcing the axis to start at 0,
# which would cut off every cell with a negative mean.

# boxplots and interaction plot for the two factors: log_novelty
boxplot(log_novelty ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_novelty")

with(df, interaction.plot(Group, phase, log_novelty)) # interaction plot

# boxplots and interaction plot for the two factors: log_user_requirement
# (the response in the formula matches the axis label)
boxplot(log_user_requirement ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_user_requirement")

with(df, interaction.plot(Group, phase, log_user_requirement)) # interaction plot

# boxplots and interaction plot for the two factors: log_infovis
boxplot(log_infovis ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_infovis")

with(df, interaction.plot(Group, phase, log_infovis)) # interaction plot

# boxplots and interaction plot for the two factors: log_total
boxplot(log_total ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_total")

with(df, interaction.plot(Group, phase, log_total)) # interaction plot


m = lmer(log_novelty ~ Group + (1|Student), data=df, REML=FALSE)
summary(m)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_novelty ~ Group + (1 | Student)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2046.0   2074.1  -1017.0   2034.0      794 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.2669 -0.6407  0.1015  0.6356  2.3143 

Random effects:
 Groups   Name        Variance Std.Dev.
 Student  (Intercept) 0.4473   0.6688  
 Residual             0.5366   0.7325  
Number of obs: 800, groups:  Student, 159

Fixed effects:
            Estimate Std. Error       df t value Pr(>|t|)  
(Intercept)  -0.1667     0.1224 159.4644  -1.361    0.175  
Group1        0.2933     0.1698 159.0131   1.728    0.086 .
Group2        0.1952     0.1689 159.4644   1.156    0.249  
Group3        0.1401     0.1689 159.4644   0.829    0.408  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
       (Intr) Group1 Group2
Group1 -0.721              
Group2 -0.725  0.523       
Group3 -0.725  0.523  0.526
plot(resid(m, type = "pearson") ~ fitted(m))

qqnorm(resid(m, type = "pearson"))
qqline(resid(m, type = "pearson"))

# library for LMM we will use on relational log_novelty 

library(lme4)
library(lmerTest)
library(car)

Set sum-to-zero contrasts for the ANOVA cells.

# Assign sum-to-zero contrasts to both factors. The assignment operator `<-`
# is required here: `<=` only compares the current contrast matrix with the
# string and prints a logical matrix, leaving the factor unchanged.
# NOTE: models fitted after this point that use Group/phase directly report
# sum-coded coefficients; re-run the notebook to refresh the captured output.
# NOTE(review): formulas that wrap the column in factor() may rebuild the
# factor and fall back to the default contrasts - confirm.
contrasts(df$Group) <- "contr.sum"
contrasts(df$phase) <- "contr.sum"
# Covariates-only baseline linear model for log_total: it contains neither
# Group nor phase. It is compared with fit.full (which adds Group) via
# anova() further down.
fit <- lm(log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 , data = df)
summary(fit)

Call:
lm(formula = log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.1861 -0.1993  0.2443  0.5703  1.4738 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.422e-16  3.222e-02   0.000 1.000000    
log_count    3.419e-01  3.255e-02  10.503  < 2e-16 ***
Q7_Q7_1     -1.852e-01  4.048e-02  -4.576 5.51e-06 ***
Q7_Q7_2      1.875e-01  4.148e-02   4.520 7.12e-06 ***
Q8_Q8_1     -8.656e-02  3.563e-02  -2.429 0.015349 *  
Q10          1.241e-01  3.477e-02   3.570 0.000378 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9113 on 794 degrees of freedom
Multiple R-squared:  0.1748,    Adjusted R-squared:  0.1696 
F-statistic: 33.63 on 5 and 794 DF,  p-value: < 2.2e-16
library(multcomp)
Loading required package: mvtnorm
Loading required package: survival
Loading required package: TH.data
Loading required package: MASS

Attaching package: ‘TH.data’

The following object is masked from ‘package:MASS’:

    geyser
library(lsmeans)
Loading required package: emmeans
The 'lsmeans' package is now basically a front end for 'emmeans'.
Users are encouraged to switch the rest of the way.
See help('transition') for more information, including how to
convert old 'lsmeans' objects and scripts to work with 'emmeans'.
#summary(glht(fit, lsm(pairwise ~ roup / phase)), test = adjusted(type='holm'))
fit.full <- lm(log_total ~ Group  + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10, data = df)
summary(fit.full)

Call:
lm(formula = log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + 
    Q8_Q8_1 + Q10, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.1191 -0.2201  0.2313  0.5645  1.3650 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.27052    0.06637  -4.076 5.04e-05 ***
Group1       0.37292    0.09209   4.050 5.64e-05 ***
Group2       0.27040    0.09198   2.940 0.003380 ** 
Group3       0.41236    0.09150   4.507 7.58e-06 ***
log_count    0.33294    0.03220  10.339  < 2e-16 ***
Q7_Q7_1     -0.19094    0.04013  -4.758 2.32e-06 ***
Q7_Q7_2      0.18702    0.04119   4.540 6.49e-06 ***
Q8_Q8_1     -0.09491    0.03524  -2.693 0.007223 ** 
Q10          0.11965    0.03488   3.430 0.000635 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8994 on 791 degrees of freedom
Multiple R-squared:  0.1993,    Adjusted R-squared:  0.1912 
F-statistic: 24.61 on 8 and 791 DF,  p-value: < 2.2e-16
anova(fit, fit.full)
Analysis of Variance Table

Model 1: log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10
Model 2: log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + 
    Q10
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1    794 659.37                                  
2    791 639.79  3    19.585 8.0715 2.664e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# boxplots of log_total for every Group x phase cell
boxplot(log_total ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_total")

# log_total was standardised earlier (mean 0), so the default y range (range
# of the cell means) is used; starting the axis at 0 would hide cells with a
# negative mean.
with(df, interaction.plot(Group, phase, log_total)) # interaction plot

fit.lmer <- lmer(log_total ~ (1 | Group) + (1| phase:Group), data = df, REML= FALSE)
summary(fit.lmer)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_total ~ (1 | Group) + (1 | phase:Group)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  2254.0   2272.7  -1123.0   2246.0      796 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.94437 -0.01942  0.28792  0.61016  1.43336 

Random effects:
 Groups      Name        Variance Std.Dev.
 phase:Group (Intercept) 0.03675  0.1917  
 Group       (Intercept) 0.02060  0.1435  
 Residual                0.94281  0.9710  
Number of obs: 800, groups:  phase:Group, 20; Group, 4

Fixed effects:
             Estimate Std. Error        df t value Pr(>|t|)
(Intercept) -0.006239   0.090377  3.950735  -0.069    0.948
fit.lmer <- lmer(log_total ~ Group  + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | Student ) + ( 1 | phase), data = df, REML= FALSE)
summary(fit.lmer)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 +      Q10 + (1 | Student) + (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  1630.8   1687.0   -803.4   1606.8      788 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.1260 -0.2616  0.0485  0.3829  4.0648 

Random effects:
 Groups   Name        Variance Std.Dev.
 Student  (Intercept) 0.56076  0.7488  
 phase    (Intercept) 0.04637  0.2153  
 Residual             0.26263  0.5125  
Number of obs: 800, groups:  Student, 159; phase, 5

Fixed effects:
             Estimate Std. Error        df t value Pr(>|t|)   
(Intercept)  -0.29274    0.16109  32.72685  -1.817  0.07834 . 
Group1        0.40329    0.17976 151.08485   2.243  0.02632 * 
Group2        0.27793    0.17908 151.09189   1.552  0.12277   
Group3        0.44309    0.17804 151.16138   2.489  0.01390 * 
log_count     0.07472    0.02545 730.76115   2.936  0.00343 **
Q7_Q7_1      -0.22864    0.07811 153.04809  -2.927  0.00394 **
Q7_Q7_2       0.20767    0.08016 151.70100   2.591  0.01051 * 
Q8_Q8_1      -0.08091    0.06857 151.20635  -1.180  0.23989   
Q10           0.13263    0.06788 151.18265   1.954  0.05257 . 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
          (Intr) Group1 Group2 Group3 lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
Group1    -0.576                                                   
Group2    -0.579  0.511                                            
Group3    -0.582  0.522  0.525                                     
log_count  0.013 -0.022 -0.004 -0.017                              
Q7_Q7_1    0.001 -0.031  0.047 -0.009  0.043                       
Q7_Q7_2    0.026  0.000 -0.089 -0.010 -0.023 -0.559                
Q8_Q8_1    0.038 -0.021 -0.051 -0.059 -0.023 -0.076  -0.167        
Q10       -0.007 -0.081  0.095  0.015 -0.021  0.005  -0.140  -0.276
fit.lmer.reduced <- lmer(log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | Student) + ( 1 | phase), data = df, REML=FALSE)
summary(fit.lmer.reduced)
Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 |      Student) + (1 | phase)
   Data: df

     AIC      BIC   logLik deviance df.resid 
  1632.1   1674.2   -807.0   1614.1      791 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.1299 -0.2523  0.0382  0.3871  4.0738 

Random effects:
 Groups   Name        Variance Std.Dev.
 Student  (Intercept) 0.59003  0.7681  
 phase    (Intercept) 0.04643  0.2155  
 Residual             0.26258  0.5124  
Number of obs: 800, groups:  Student, 159; phase, 5

Fixed effects:
              Estimate Std. Error         df t value Pr(>|t|)   
(Intercept)  -0.005204   0.115445   9.391843  -0.045  0.96499   
log_count     0.073916   0.025489 727.808163   2.900  0.00385 **
Q7_Q7_1      -0.224039   0.079672 153.285562  -2.812  0.00557 **
Q7_Q7_2       0.208533   0.081574 151.923280   2.556  0.01156 * 
Q8_Q8_1      -0.071758   0.070034 151.441097  -1.025  0.30718   
Q10           0.138773   0.068365 151.424171   2.030  0.04412 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
          (Intr) lg_cnt Q7_Q7_1 Q7_Q7_2 Q8_Q8_
log_count  0.002                              
Q7_Q7_1    0.005  0.041                       
Q7_Q7_2   -0.002 -0.022 -0.556                
Q8_Q8_1    0.002 -0.023 -0.075  -0.172        
Q10        0.002 -0.024 -0.009  -0.126  -0.275
anova(fit.lmer.reduced, fit.lmer)
Data: df
Models:
fit.lmer.reduced: log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase)
fit.lmer: log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase)
                 npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
fit.lmer.reduced    9 1632.1 1674.2 -807.04   1614.1                       
fit.lmer           12 1630.8 1687.0 -803.41   1606.8 7.2588  3    0.06409 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

User Requirement Score

# Full mixed model for the user-requirement score: Group plus the covariates
# as fixed effects, with crossed random intercepts for Student and phase.
# Fitted by maximum likelihood (REML = FALSE).
fit.requirement.full <- lmer(log_user_requirement ~  factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | Student) + ( 1 | phase), data = df, REML = FALSE)
# car::Anova names this argument `test.statistic` (singular); a misspelled
# name is silently ignored and the default Wald chi-square test is used.
# The chi-square test is requested explicitly so the call matches what runs.
# NOTE(review): an F test on an lmer fit would presumably need a REML fit -
# confirm against the car documentation before switching.
Anova(fit.requirement.full, type = 3, test.statistic = "Chisq")
Analysis of Deviance Table (Type III Wald chisquare tests)

Response: log_user_requirement
               Chisq Df Pr(>Chisq)   
(Intercept)   1.8217  1   0.177116   
factor(Group) 5.4876  3   0.139384   
log_count     8.3467  1   0.003864 **
Q7_Q7_1       8.8566  1   0.002920 **
Q7_Q7_2       3.0003  1   0.083248 . 
Q8_Q8_1       0.0042  1   0.948458   
Q10           3.1256  1   0.077072 . 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
fit.requirement.full
Linear mixed model fit by maximum likelihood  ['lmerModLmerTest']
Formula: log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 +  
    Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase)
   Data: df
      AIC       BIC    logLik  deviance  df.resid 
1759.1002 1815.3156 -867.5501 1735.1002       788 
Random effects:
 Groups   Name        Std.Dev.
 Student  (Intercept) 0.7193  
 phase    (Intercept) 0.2364  
 Residual             0.5700  
Number of obs: 800, groups:  Student, 159; phase, 5
Fixed Effects:
   (Intercept)  factor(Group)1  factor(Group)2  factor(Group)3       log_count         Q7_Q7_1         Q7_Q7_2  
      -0.22184         0.31507         0.15640         0.36937         0.08078        -0.22668         0.13533  
       Q8_Q8_1             Q10  
      -0.00432         0.11695  
# boxplots of log_user_requirement for every Group x phase cell
boxplot(log_user_requirement ~ Group * phase, data = df, xlab="Group.Phase", ylab="log_user_requirement")

# log_user_requirement was standardised earlier (mean 0), so the default y
# range (range of the cell means) is used; starting the axis at 0 would hide
# cells with a negative mean.
with(df, interaction.plot(Group, phase, log_user_requirement)) # interaction plot

# Reduced mixed model for the user-requirement score: covariates only (no
# Group), with crossed random intercepts for Student and phase, fitted by ML.
fit.requirement <- lmer(log_user_requirement ~  log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + ( 1 | Student) + ( 1 | phase), data = df, REML = FALSE)
# Test the model fitted on the line above. `fit` is the log_total linear
# model and does not belong in this section. car::Anova's argument is
# `test.statistic` (singular); a misspelled name is silently ignored.
# NOTE: the console output captured after this line must be regenerated by
# re-running the notebook.
Anova(fit.requirement, type = 3, test.statistic = "Chisq")
Anova Table (Type III tests)

Response: log_total
            Sum Sq  Df  F value    Pr(>F)    
(Intercept)   0.00   1   0.0000  1.000000    
log_count    91.62   1 110.3216 < 2.2e-16 ***
Q7_Q7_1      17.39   1  20.9366 5.507e-06 ***
Q7_Q7_2      16.97   1  20.4327 7.117e-06 ***
Q8_Q8_1       4.90   1   5.9015  0.015349 *  
Q10          10.59   1  12.7473  0.000378 ***
Residuals   659.37 794                       
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
fit.requirement
Linear mixed model fit by maximum likelihood  ['lmerModLmerTest']
Formula: log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 +      Q10 + (1 | Student) + (1 | phase)
   Data: df
      AIC       BIC    logLik  deviance  df.resid 
1758.4979 1800.6594 -870.2489 1740.4979       791 
Random effects:
 Groups   Name        Std.Dev.
 Student  (Intercept) 0.7331  
 phase    (Intercept) 0.2366  
 Residual             0.5700  
Number of obs: 800, groups:  Student, 159; phase, 5
Fixed Effects:
(Intercept)    log_count      Q7_Q7_1      Q7_Q7_2      Q8_Q8_1          Q10  
  -0.006829     0.081058    -0.220287     0.132015     0.002427     0.125682  
# Residual diagnostics for the user-requirement mixed model of this section
# (fit.requirement), not for the earlier novelty model `m`.
# Pearson residuals against fitted values: look for trends or funnel shapes.
plot(resid(fit.requirement, type = "pearson") ~ fitted(fit.requirement))

# Normal Q-Q plot of the same residuals with a reference line.
qqnorm(resid(fit.requirement, type = "pearson"))
qqline(resid(fit.requirement, type = "pearson"))

# Compare the reduced model with the model that adds factor(Group)
anova(fit.requirement, fit.requirement.full)
Data: df
Models:
fit.requirement: log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase)
fit.requirement.full: log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase)
                     npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
fit.requirement         9 1758.5 1800.7 -870.25   1740.5                     
fit.requirement.full   12 1759.1 1815.3 -867.55   1735.1 5.3976  3     0.1449
---
title: "R Notebook"
output:
  pdf_document: default
  html_notebook: default
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
# library for LMM 

library(lme4)
library(lmerTest)
library(car)
```



```{r}
# Load the score/commit table (comma-separated, with a header row) and keep
# only the rows that have no missing values.
df <- read.csv("input/scores_commits.csv")
df <- df[complete.cases(df), ]
df
```

```{r}
# Treat Group and phase as categorical (nominal) variables.
df$Group <- factor(df$Group)
df$phase <- factor(df$phase)
```


```{r}
# log(x + 1) transform, applied to every score, the commit count and the
# survey items.
log_plus_one <- function(x) {
  log(x + 1)
}

# Scores and commit count go into new log_* columns.
df$log_novelty <- log_plus_one(df$novelty)
df$log_user_requirement <- log_plus_one(df$user.requirement)
df$log_infovis <- log_plus_one(df$infovis)
df$log_total <- log_plus_one(df$total)
df$log_count <- log_plus_one(df$count)

# Survey items are overwritten in place.
df$Q7_Q7_1 <- log_plus_one(df$Q7_Q7_1)
df$Q7_Q7_2 <- log_plus_one(df$Q7_Q7_2)
df$Q8_Q8_1 <- log_plus_one(df$Q8_Q8_1)
df$Q10 <- log_plus_one(df$Q10)
```


```{r}
# Center and scale (z-score) the survey items and all log-transformed
# columns so that model coefficients are on a common scale.
cols <- c(
  "Q7_Q7_1", "Q7_Q7_2", "Q8_Q8_1", "Q10",
  "log_novelty", "log_user_requirement", "log_infovis",
  "log_total", "log_count"
)
df[cols] <- scale(df[cols])
df
```
```{r}
# Reduced novelty model: commit count and survey items only, no Group term.
mod.reduce.novelty <- lm(
  log_novelty ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
  data = df
)
summary(mod.reduce.novelty)
# Information criteria for comparison with the full model.
AIC(mod.reduce.novelty)
BIC(mod.reduce.novelty)

```


```{r}
# Full novelty model: the reduced model plus the Group factor.
mod.full.novelty <- lm(
  log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
  data = df
)
summary(mod.full.novelty)
# Information criteria for comparison with the reduced model.
AIC(mod.full.novelty)
BIC(mod.full.novelty)

```

```{r}
# F-test comparison of the reduced and full novelty models; the second
# model adds factor(Group) to the first.
anova(mod.reduce.novelty, mod.full.novelty)
```
```{r}
library(ALSM)
# Stepwise AIC selection starting from the full novelty model.
# step() has no `method` argument: it was absorbed by `...` and ignored, which
# left the backward-only default used when no scope is supplied. The search
# mode is selected with `direction`.
step(
  lm(log_novelty ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
     data = df),
  direction = "both", trace = 1
)
```

```{r}
# Reduced user-requirement model: commit count and survey items only.
mod.reduce.ur <- lm(
  log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
  data = df
)
summary(mod.reduce.ur)
# Information criteria for comparison with the full model.
AIC(mod.reduce.ur)
BIC(mod.reduce.ur)
```

```{r}
# Full user-requirement model: the reduced model plus the Group factor.
mod.full.ur <- lm(
  log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 +
    Q8_Q8_1 + Q10,
  data = df
)
summary(mod.full.ur)
# Information criteria for comparison with the reduced model.
AIC(mod.full.ur)
BIC(mod.full.ur)
```
```{r}
# F-test comparison of the reduced and full user-requirement models; the
# second model adds factor(Group) to the first.
anova(mod.reduce.ur, mod.full.ur)
```
```{r}
library(ALSM)
# Stepwise AIC selection starting from the full user-requirement model.
# step() has no `method` argument: it was absorbed by `...` and ignored, which
# left the backward-only default used when no scope is supplied. The search
# mode is selected with `direction`.
step(
  lm(log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 +
       Q8_Q8_1 + Q10,
     data = df),
  direction = "both", trace = 1
)
```



```{r}
# Reduced infovis mixed model: fixed effects for commit count and survey
# items, random intercepts for Student and phase, fit by ML (REML = FALSE).
mod.reduce.vis <- lmer(
  log_infovis ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +
    (1 | Student) + (1 | phase),
  data = df,
  REML = FALSE
)
summary(mod.reduce.vis)
# Information criteria for comparison with the full model.
AIC(mod.reduce.vis)
BIC(mod.reduce.vis)
```

```{r}
# Full infovis mixed model: the reduced model plus the Group factor as a
# fixed effect; same random intercepts, fit by ML (REML = FALSE).
mod.full.vis <- lmer(
  log_infovis ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +
    (1 | Student) + (1 | phase),
  data = df,
  REML = FALSE
)
summary(mod.full.vis)
# Information criteria for comparison with the reduced model.
AIC(mod.full.vis)
BIC(mod.full.vis)
```
```{r}
# Compare the reduced and full infovis mixed models; the second model adds
# factor(Group) to the first.
anova(mod.reduce.vis, mod.full.vis)
```
```{r}
library(ALSM)
# Stepwise AIC selection on an ordinary lm() for infovis (fixed effects only).
# step() has no `method` argument: it was absorbed by `...` and ignored, which
# left the backward-only default used when no scope is supplied. The search
# mode is selected with `direction`.
step(
  lm(log_infovis ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
     data = df),
  direction = "both", trace = 1
)
```
```{r}
# Reduced total-score model: commit count and survey items only.
mod.reduce.total <- lm(
  log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
  data = df
)
summary(mod.reduce.total)
# Information criteria for comparison with the full model.
AIC(mod.reduce.total)
BIC(mod.reduce.total)
```


```{r}
# Full total-score model: the reduced model plus Group.
# log_count is included so that mod.reduce.total is nested inside this model;
# without it the anova() comparison of the two fits is not a valid nested test.
mod.full.total <- lm(
  log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
  data = df
)
summary(mod.full.total)
# Information criteria for comparison with the reduced model.
AIC(mod.full.total)
BIC(mod.full.total)
```
```{r}
# F-test comparison of the two total-score models; the test assumes the
# first model is nested in the second.
anova(mod.reduce.total, mod.full.total)
```

```{r}
# Re-apply the factor conversion to Group and phase before the grouped
# summaries and plots.
df$Group <- factor(df$Group)
df$phase <- factor(df$phase)
```

```{r}
library(plyr)
# Five-number summary (plus mean) of log_novelty within each Group x phase cell.
ddply(df, ~ Group * phase, function(cell) summary(cell$log_novelty))
# Mean and standard deviation of log_novelty within each Group x phase cell.
ddply(
  df, ~ Group * phase, summarise,
  log_novelty.mean = mean(log_novelty),
  log_novelty.sd = sd(log_novelty)
)
```
```{r}
# Box plots of log_novelty for every Group x phase cell.
boxplot(log_novelty ~ Group * phase, data = df,
        xlab = "Group.Phase", ylab = "log_novelty")
# Interaction plot of the cell means (Group on the x-axis, one trace per phase).
# log_novelty is z-scored earlier in the notebook, so cell means can be
# negative; a y-axis starting at 0 would clip them, so the default range
# (the range of the cell means) is used instead.
with(df, interaction.plot(Group, phase, log_novelty))
```

```{r}
# Box plots of log_user_requirement for every Group x phase cell.
# (The response was previously log_novelty, which did not match the axis label.)
boxplot(log_user_requirement ~ Group * phase, data = df,
        xlab = "Group.Phase", ylab = "log_user_requirement")
# Interaction plot of the cell means (Group on the x-axis, one trace per phase).
# The response is z-scored earlier in the notebook, so cell means can be
# negative; the default y-range (range of the cell means) avoids clipping them.
with(df, interaction.plot(Group, phase, log_user_requirement))
```

```{r}
# Box plots of log_infovis for every Group x phase cell.
# (The response was previously log_novelty, which did not match the axis label.)
boxplot(log_infovis ~ Group * phase, data = df,
        xlab = "Group.Phase", ylab = "log_infovis")
# Interaction plot of the cell means (Group on the x-axis, one trace per phase).
# The response is z-scored earlier in the notebook, so cell means can be
# negative; the default y-range (range of the cell means) avoids clipping them.
with(df, interaction.plot(Group, phase, log_infovis))
```

```{r}
# Box plots of log_total for every Group x phase cell.
# (The response was previously log_novelty, which did not match the axis label.)
boxplot(log_total ~ Group * phase, data = df,
        xlab = "Group.Phase", ylab = "log_total")
# Interaction plot of the cell means (Group on the x-axis, one trace per phase).
# The response is z-scored earlier in the notebook, so cell means can be
# negative; the default y-range (range of the cell means) avoids clipping them.
with(df, interaction.plot(Group, phase, log_total))
```






```{r}
# Random-intercept model for novelty: Group as the only fixed effect, one
# intercept per Student, fit by maximum likelihood.
m <- lmer(
  log_novelty ~ Group + (1 | Student),
  data = df,
  REML = FALSE
)
summary(m)
```
```{r}
# Residuals-vs-fitted scatter for model `m` (Pearson residuals).
plot(resid(m, type = "pearson") ~ fitted(m))
# Normal Q-Q plot of the same residuals, with a reference line.
m_resid <- resid(m, type = "pearson")
qqnorm(m_resid)
qqline(m_resid)
```
```{r}
# library for LMM we will use on relational log_novelty 

library(lme4)
library(lmerTest)
library(car)
```

# set sum-to-zero contrast for ANOVA cells 

```{r}
# Assign sum-to-zero contrasts to both factors. The original used `<=`
# (a comparison), which only printed a logical matrix and left the default
# treatment contrasts in place.
contrasts(df$Group) <- "contr.sum"
contrasts(df$phase) <- "contr.sum"
```

```{r}
# Baseline linear model for the total score: commit count and survey items
# only (neither Group nor phase enters this model).
fit <- lm(
  log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
  data = df
)
summary(fit)
```

```{r}
# Loaded for the commented-out glht()/lsm() pairwise comparison below;
# nothing else in this chunk uses them.
library(multcomp)
library(lsmeans)
# summary(glht(fit, lsm(pairwise ~ Group / phase)), test = adjusted(type='holm'))
```

```{r}
# Total-score linear model with Group added to the baseline predictors.
fit.full <- lm(
  log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10,
  data = df
)
summary(fit.full)
```

```{r}
# F-test comparison of the baseline total-score model against the model
# that adds Group.
anova(fit, fit.full)
```
```{r}
# Box plots of log_total for every Group x phase cell.
boxplot(log_total ~ Group * phase, data = df,
        xlab = "Group.Phase", ylab = "log_total")
# Interaction plot of the cell means (Group on the x-axis, one trace per phase).
# log_total is z-scored earlier in the notebook, so cell means can be
# negative; a y-axis starting at 0 would clip them, so the default range
# (the range of the cell means) is used instead.
with(df, interaction.plot(Group, phase, log_total))
```
```{r}
# Random-effects-only model for the total score: a random intercept for
# Group and one for each phase-within-Group combination, fit by ML.
fit.lmer <- lmer(
  log_total ~ (1 | Group) + (1 | phase:Group),
  data = df,
  REML = FALSE
)
summary(fit.lmer)
```



```{r}
# Full total-score mixed model: Group, commit count and survey items as
# fixed effects; random intercepts for Student and phase; fit by ML.
# This overwrites any earlier `fit.lmer`.
fit.lmer <- lmer(
  log_total ~ Group + log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +
    (1 | Student) + (1 | phase),
  data = df,
  REML = FALSE
)
summary(fit.lmer)
```


```{r}
# Reduced total-score mixed model: same as the full model but without the
# Group fixed effect; fit by ML.
fit.lmer.reduced <- lmer(
  log_total ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +
    (1 | Student) + (1 | phase),
  data = df,
  REML = FALSE
)
summary(fit.lmer.reduced)
```

```{r}
# Compare the reduced total-score mixed model against the one that adds Group.
anova(fit.lmer.reduced, fit.lmer)
```


# User Requirement Score

```{r}
# Full user-requirement mixed model: Group, commit count and survey items as
# fixed effects; crossed random intercepts for Student and phase; fit by ML.
fit.requirement.full <- lmer(
  log_user_requirement ~ factor(Group) + log_count + Q7_Q7_1 + Q7_Q7_2 +
    Q8_Q8_1 + Q10 + (1 | Student) + (1 | phase),
  data = df,
  REML = FALSE
)
# Type III tests of the fixed effects. The argument is `test.statistic`; the
# misspelled `test.statistics` was silently ignored, so the Wald chi-square
# default was what actually ran. It is now requested explicitly.
# NOTE(review): car documents F tests for mixed models as needing a REML fit;
# refit with REML = TRUE before switching this to "F".
Anova(fit.requirement.full, type = 3, test.statistic = "Chisq")
fit.requirement.full
```

```{r}
# Box plots of log_user_requirement for every Group x phase cell.
boxplot(log_user_requirement ~ Group * phase, data = df,
        xlab = "Group.Phase", ylab = "log_user_requirement")
# Interaction plot of the cell means (Group on the x-axis, one trace per phase).
# The response is z-scored earlier in the notebook, so cell means can be
# negative; a y-axis starting at 0 would clip them, so the default range
# (the range of the cell means) is used instead.
with(df, interaction.plot(Group, phase, log_user_requirement))
```


```{r}
# Reduced user-requirement mixed model: commit count and survey items as
# fixed effects (no Group); crossed random intercepts for Student and phase;
# fit by ML.
fit.requirement <- lmer(
  log_user_requirement ~ log_count + Q7_Q7_1 + Q7_Q7_2 + Q8_Q8_1 + Q10 +
    (1 | Student) + (1 | phase),
  data = df,
  REML = FALSE
)
# Type III tests for the model fitted above. The original call passed `fit`
# (the log_total linear model), so the table described the wrong response.
# `test.statistic` is the argument name; the Wald chi-square test is requested
# because the model is fit by ML.
Anova(fit.requirement, type = 3, test.statistic = "Chisq")
fit.requirement
```

```{r}
# Residual diagnostics for the user-requirement model. The original plotted
# `m`, the log_novelty model from an earlier section.
# Pearson residuals against fitted values:
plot(resid(fit.requirement, type = "pearson") ~ fitted(fit.requirement))
# Normal Q-Q plot of the same residuals, with a reference line:
qqnorm(resid(fit.requirement, type = "pearson"))
qqline(resid(fit.requirement, type = "pearson"))
```

```{r}
# Compare the reduced user-requirement model against the one that adds
# factor(Group).
anova(fit.requirement, fit.requirement.full)
```
 
